
pacman::p_load(tidyverse,
               fs,
               fst,
               assertthat,
               tictoc)

# Quarter start dates spanning the training window, both endpoints included
# (as far as a quarterly step from the start date reaches).
training_qtrs <- seq(
  from = as.Date(TRAINING_START_QTR),
  to = as.Date(TRAINING_END_QTR),
  by = "quarter"
)

# Index every file in the per-quarter / per-sample data folder.
#   qtr         - the first 8-digit run in the path, parsed as a year-month-day
#                 date.
#   rand_no_cid - the 4 digits immediately preceding the ".fst" extension,
#                 kept as a character string.
# `ymd` is namespace-qualified because lubridate is only attached by
# `tidyverse` from version 2.0.0 onwards; the dot in the lookahead is escaped
# so it matches a literal "." rather than any character.
file_info <- tibble(file_name = dir_ls("../../data_qtr_rand_no_cid_with_outcome/")) %>%
  mutate(
    qtr = lubridate::ymd(str_extract(file_name, "[0-9]{8}")),
    rand_no_cid = str_extract(file_name, "[0-9]{4}(?=\\.fst)")
  )

# Keep only the files whose quarter lies in the training window and whose
# sample ID is one of the requested RAND_NO_CID values.
training_files <- filter(
  file_info,
  qtr %in% training_qtrs & rand_no_cid %in% RAND_NO_CID
)



# Every requested sample ID must be backed by at least one selected file.
# Both sides are sorted so the comparison does not depend on the order in
# which files were listed or in which RAND_NO_CID was written (same approach
# as the quarter check performed after loading).
are_equal(sort(unique(training_files$rand_no_cid)), sort(RAND_NO_CID)) %>%
  stopifnot("Desired training IDs not present in pulled files" = .)



# Read every selected file, tag its rows with the sample ID parsed from the
# file name, optionally restrict to qualifying open accounts, keep only the
# requested consumer categories, and row-bind everything into `df_training`.
# The whole load is timed with tic()/toc().
tic(msg = "Loading Training")
df_training <- map_dfr(training_files$file_name, function(x) {
  print(x)

  # The 4 digits directly before ".fst" identify the sample the file belongs to.
  df_qtr <- read_fst(x) %>%
    mutate(
      rand_no_cid = str_extract(x, "[0-9]{4}(?=\\.fst)")
    )

  if (LLH_COND == TRUE) {

    cat("Applying LLH condition\n")

    # The matching open-accounts file lives under the same file name in a
    # sibling folder; only (cid, qtr) pairs flagged with
    # open_acc_has_succesive_2qtr_within_8qtr == 1 are kept.
    x_open_acc <- str_replace(x, "data_qtr_rand_no_cid_with_outcome", "data/open_accounts")
    df_open_acc <- read_fst(x_open_acc, columns = c("cid", "qtr", "open_acc_has_succesive_2qtr_within_8qtr")) %>%
      filter(open_acc_has_succesive_2qtr_within_8qtr == 1) %>%
      select(cid, qtr)

    # Filtering join: the open-accounts table contributes no columns, so a
    # semi_join keeps exactly the matching rows without duplicating them if a
    # (cid, qtr) pair appears more than once on the right-hand side.
    df_qtr <- df_qtr %>%
      semi_join(df_open_acc, by = c("cid", "qtr"))

  }

  df_qtr %>%
    filter(consumer_category %in% CONSUMER_GROUP) # there are redundant lines in individual scripts
})
toc()



# Sanity checks on the assembled training set.

# A missing t_default would turn the mean below into NA and make the
# "no defaulters" check fail with a misleading message, so report it
# explicitly first.
stopifnot("t_default contains missing values in the training set" = !anyNA(df_training$t_default))

mean_t_default <- df_training %>%
  pull(t_default) %>%
  mean()

# The quarters actually loaded must be exactly the requested training quarters
# (order-insensitive).
are_equal(unique(df_training$qtr) %>% sort(), training_qtrs %>% sort()) %>%
  stopifnot("Desired training year-qtrs not present in pulled files" = .)

# A mean above zero means at least one row has a positive t_default.
b_mean_above_0 <- mean_t_default > 0
stopifnot("There are no defaulters in the training set" = b_mean_above_0)

# Frequency of each t_default value, for eyeballing the class balance.
table(df_training$t_default)

# Persist the training set as
#   ../../data/pipeline_outputs/<SPECIAL_SUFFIX>/
#     train_<TRAINING_SUFFIX>_<RAND_NO_CID_SMALLEST_LARGEST>/training.fst
write_fst(
  x = df_training,
  path = paste0(
    "../../data/pipeline_outputs/", SPECIAL_SUFFIX,
    "/", "train_", TRAINING_SUFFIX,
    "_", RAND_NO_CID_SMALLEST_LARGEST,
    "/", "training.fst"
  )
)

